{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import os\n",
    "# for i in range(14):\n",
    "#     os.system(f'wget https://f000.backblazeb2.com/file/malay-dataset/tatabahasa/dataset-tatabahasa-{i}.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "with open('dataset-tatabahasa-1.pkl', 'rb') as fopen:\n",
    "    data = pickle.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['Meskipun', 2],\n",
       " ['Madonna', 2],\n",
       " ['melihat', 2],\n",
       " ['keadaan', 2],\n",
       " ['ini', 2],\n",
       " ['sebagai', 2],\n",
       " ['menggugat', 2],\n",
       " ['imejnya', 2],\n",
       " [',', 2],\n",
       " ['beliau', 2],\n",
       " ['tidak', 2],\n",
       " ['dapat', 2],\n",
       " ['mengambil', 2],\n",
       " ['tindakan', 2],\n",
       " ['undang-undang', 2],\n",
       " ['terhadap', 2],\n",
       " ['majalah-majalah', 2],\n",
       " ['tersebut', 2],\n",
       " ['kerana', 2],\n",
       " ['penerbitan', 2],\n",
       " ['dilakukan', 2],\n",
       " ['dalam', 2],\n",
       " ['cara', 2],\n",
       " ['yang', 2],\n",
       " ['dilihat', 2],\n",
       " ['sebagai', 2],\n",
       " ['di', 2],\n",
       " ['sisi', 2],\n",
       " ['undang-undang', 2],\n",
       " ['media', 2],\n",
       " ['Amerika', 2],\n",
       " ['Syarikat', 2],\n",
       " ['.', 2]]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[6][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('Tertirisnya gambar ini sangat aneh dunia memandangkan keadaannya yang terkenal ke ketika ini .',\n",
       " 'Tertirisnya gambar-gambar ini sangat mengejutkan dunia memandangkan keadaannya yang terkenal pada ketika ini .')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y = ' '.join([w[0] for w in data[5][0]])\n",
    "x = ' '.join([w[0] for w in data[5][1]])\n",
    "x, y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    string = string.replace('\\n', ' ').replace('\\t', ' ')\n",
    "    string = re.sub(r'[ ]+', ' ', string).strip()\n",
    "    return string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "from t5.data import preprocessors as prep\n",
    "import functools\n",
    "import t5\n",
    "import gin\n",
    "import sentencepiece as spm\n",
    "from glob import glob\n",
    "import os\n",
    "import json\n",
    "\n",
    "gin.parse_config_file('pretrained_models_base_operative_config.gin')\n",
    "vocab = 'sp10m.cased.ms-en.model'\n",
    "sp = spm.SentencePieceProcessor()\n",
    "sp.Load(vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "pkls = glob('dataset-tatabahasa-*.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-5.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99398/99398 [00:00<00:00, 190782.44it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 173849.95it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-11.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99449/99449 [00:00<00:00, 199602.98it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 185720.16it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-0.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99438/99438 [00:00<00:00, 192316.53it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 186812.04it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-9.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99407/99407 [00:00<00:00, 171218.47it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 178785.34it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-7.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99442/99442 [00:00<00:00, 190427.18it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 180788.97it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-10.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99398/99398 [00:00<00:00, 185606.59it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 180757.80it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-1.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99422/99422 [00:00<00:00, 190631.71it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 184592.20it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-2.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99455/99455 [00:00<00:00, 192246.58it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 176676.66it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-4.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99435/99435 [00:00<00:00, 187586.21it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 174442.85it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-12.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99419/99419 [00:00<00:00, 174538.03it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 147261.57it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-8.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99380/99380 [00:00<00:00, 188886.81it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 177950.95it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-6.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99406/99406 [00:00<00:00, 191162.50it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 186745.50it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-13.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99465/99465 [00:00<00:00, 207128.31it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 191906.30it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dataset-tatabahasa-3.pkl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 99426/99426 [00:00<00:00, 180361.67it/s]\n",
      "100%|██████████| 500/500 [00:00<00:00, 188457.23it/s]\n"
     ]
    }
   ],
   "source": [
    "train_X, train_Y, test_X, test_Y = [], [], [], []\n",
    "for file in pkls:\n",
    "    print(file)\n",
    "    with open(file, 'rb') as fopen:\n",
    "        data = pickle.load(fopen)\n",
    "    train, test = train_test_split(data, test_size = 500)\n",
    "    for row in tqdm(train):\n",
    "        y = ' '.join([w[0] for w in row[0]])\n",
    "        x = ' '.join([w[0] for w in row[1]])\n",
    "        train_X.append(x)\n",
    "        train_Y.append(y)\n",
    "        \n",
    "    for row in tqdm(test):\n",
    "        y = ' '.join([w[0] for w in row[0]])\n",
    "        x = ' '.join([w[0] for w in row[1]])\n",
    "        test_X.append(x)\n",
    "        test_Y.append(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1391940, 7000)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(train_X), len(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1391940/1391940 [00:31<00:00, 44248.76it/s]\n"
     ]
    }
   ],
   "source": [
    "with tf.io.gfile.GFile('kesalahan-tatabahasa.tsv', \"w\") as outfile:\n",
    "    for i in tqdm(range(len(train_X))):\n",
    "        if len(train_X) and len(train_Y):\n",
    "            l = cleaning(train_X[i])\n",
    "            r = cleaning(train_Y[i])\n",
    "            outfile.write(\"%s\\t%s\\n\" % (l, r))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "def kesalahan_dataset(split, shuffle_files = False):\n",
    "    del shuffle_files\n",
    "    ds = tf.data.TextLineDataset(\n",
    "        [\n",
    "            'kesalahan-tatabahasa.tsv'\n",
    "        ]\n",
    "    )\n",
    "\n",
    "    ds = ds.map(\n",
    "        functools.partial(\n",
    "            tf.io.decode_csv,\n",
    "            record_defaults = ['', ''],\n",
    "            field_delim = '\\t',\n",
    "            use_quote_delim = False,\n",
    "        ),\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )\n",
    "    ds = ds.map(lambda *ex: dict(zip(['question', 'answer'], ex)))\n",
    "    return ds\n",
    "\n",
    "def kesalahan_preprocessor(ds):\n",
    "    def to_inputs_and_targets(ex):\n",
    "        return {\n",
    "            'inputs': tf.strings.join(['kesalahan tatabahasa: ', ex['question']]),\n",
    "            'targets': ex['answer'],\n",
    "        }\n",
    "\n",
    "    return ds.map(\n",
    "        to_inputs_and_targets,\n",
    "        num_parallel_calls = tf.data.experimental.AUTOTUNE,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "t5.data.TaskRegistry.remove('kesalahan_dataset')\n",
    "t5.data.TaskRegistry.add(\n",
    "    'kesalahan_dataset',\n",
    "    dataset_fn = kesalahan_dataset,\n",
    "    splits = ['train'],\n",
    "    text_preprocessor = [kesalahan_preprocessor],\n",
    "    sentencepiece_model_path = vocab,\n",
    "    metric_fns = [t5.evaluation.metrics.accuracy],\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "nq_task = t5.data.TaskRegistry.get(\"kesalahan_dataset\")\n",
    "ds = nq_task.get_dataset(split='paraphrase.tsv', sequence_length={\"inputs\": 1024, \"targets\": 1024})\n",
    "r = tfds.as_numpy(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'inputs_plaintext': b'kesalahan tatabahasa: Editornya ialah Onn Bin Jaafar ( 1930 - 33 ) , Sayyid Alwi bin Sayyid Shaykh al-Hadi ( 1933 - 34 ) , Sayyid Hussein bin Ali Alsagoff ( 1934 - 1941 ) , dan Cikgu Ibrahim Yaakob ( 1941 ) .',\n",
       " 'inputs': array([ 1946, 31301,    31,  6456,    38,   358,   544,   152,  3514,\n",
       "        17579,    13,     4,  8847,    13,     7,  3403,    13,     5,\n",
       "           13,    14, 10871,   128,  1400,   418,  1922,  1890, 10871,\n",
       "          128,  1400, 26379,  3484,  1781,     7,  3884,   387,    13,\n",
       "            4, 20254,    13,     7,  3472,    13,     5,    13,    14,\n",
       "        10871,   128,  1400,  8060,  1890,  1612,   418,    16,  5524,\n",
       "         1696,    13,     4, 20911,    13,     7, 14390,    13,     5,\n",
       "           13,    14,    22,  6904,  1956,  2884, 26976,    13,     4,\n",
       "        14390,    13,     5,    13,     3,     1]),\n",
       " 'targets_plaintext': b'Editornya ialah Onn Bin Jaafar ( 1930 - 33 ) , Sayyid Alwi bin Sayyid Shaykh al-Hadi ( 1933 - 34 ) , Sayyid Hussein bin Ali Alsagoff ( 1934 - 1941 ) , dan Cikgu Ibrahim Yaakob ( 1941 ) .',\n",
       " 'targets': array([ 6456,    38,   358,   544,   152,  3514, 17579,    13,     4,\n",
       "         8847,    13,     7,  3403,    13,     5,    13,    14, 10871,\n",
       "          128,  1400,   418,  1922,  1890, 10871,   128,  1400, 26379,\n",
       "         3484,  1781,     7,  3884,   387,    13,     4, 20254,    13,\n",
       "            7,  3472,    13,     5,    13,    14, 10871,   128,  1400,\n",
       "         8060,  1890,  1612,   418,    16,  5524,  1696,    13,     4,\n",
       "        20911,    13,     7, 14390,    13,     5,    13,    14,    22,\n",
       "         6904,  1956,  2884, 26976,    13,     4, 14390,    13,     5,\n",
       "           13,     3,     1])}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "next(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('test-set-kesalahan-tatabahasa.json', 'w') as fopen:\n",
    "    json.dump({'X': test_X, 'Y': test_Y}, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
